library(tidyverse)
# Load the job-postings data; readr guesses each column's type and reports it.
job <- read_csv(file = "dataset/fake_job_postings.csv")
Parsed with column specification:
cols(
job_id = col_double(),
title = col_character(),
location = col_character(),
department = col_character(),
salary_range = col_character(),
company_profile = col_character(),
description = col_character(),
requirements = col_character(),
benefits = col_character(),
telecommuting = col_double(),
has_company_logo = col_double(),
has_questions = col_double(),
employment_type = col_character(),
required_experience = col_character(),
required_education = col_character(),
industry = col_character(),
`function` = col_character(),
fraudulent = col_double()
)
# Auto-print the tibble for a first look at the data.
job
# Transposed overview: one line per column showing its type and leading values.
glimpse(job)
Rows: 17,880
Columns: 18
$ job_id [3m[38;5;246m<dbl>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,...
$ title [3m[38;5;246m<chr>[39m[23m "Marketing Intern", "Customer Service - Cloud Video Production", "Commissioning Machinery Assistant (CMA)", "Account Executive - Washington DC", "Bill Review Manager", "Accounting Clerk", "Head of Content (...
$ location [3m[38;5;246m<chr>[39m[23m "US, NY, New York", "NZ, , Auckland", "US, IA, Wever", "US, DC, Washington", "US, FL, Fort Worth", "US, MD,", "DE, BE, Berlin", "US, CA, San Francisco", "US, FL, Pensacola", "US, AZ, Phoenix", "US, NJ, Jers...
$ department [3m[38;5;246m<chr>[39m[23m "Marketing", "Success", NA, "Sales", NA, NA, "ANDROIDPIT", NA, NA, NA, NA, "HR", NA, NA, "Sales", "Sales", "R&D", NA, NA, NA, NA, NA, "Engagement", "Businessfriend.com", NA, NA, "Marketing", "Medical", NA, ...
$ salary_range [3m[38;5;246m<chr>[39m[23m NA, NA, NA, NA, NA, NA, "20000-28000", NA, NA, NA, "100000-120000", NA, NA, NA, NA, "120000-150000", NA, NA, NA, NA, NA, NA, NA, "100000-120000", NA, NA, NA, NA, NA, NA, NA, "50000-65000", NA, NA, NA, NA, N...
$ company_profile [3m[38;5;246m<chr>[39m[23m "We're Food52, and we've created a groundbreaking and award-winning cooking site. We support, connect, and celebrate home cooks, and give them everything they need in one place.We have a top editorial, busi...
$ description [3m[38;5;246m<chr>[39m[23m "Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of edi...
$ requirements [3m[38;5;246m<chr>[39m[23m "Experience with content management systems a major plus (any blogging counts!)Familiar with the Food52 editorial voice and aestheticLoves food, appreciates the importance of home cooking and cooking with t...
$ benefits [3m[38;5;246m<chr>[39m[23m NA, "What you will get from usThrough being part of the 90 Seconds team you will gain:experience working on projects located around the world with an international brandexperience working with a variety of ...
$ telecommuting [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ has_company_logo [3m[38;5;246m<dbl>[39m[23m 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,...
$ has_questions [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,...
$ employment_type [3m[38;5;246m<chr>[39m[23m "Other", "Full-time", NA, "Full-time", "Full-time", NA, "Full-time", NA, "Full-time", "Part-time", "Full-time", NA, "Full-time", "Full-time", "Full-time", "Full-time", "Full-time", NA, NA, "Full-time", NA, ...
$ required_experience [3m[38;5;246m<chr>[39m[23m "Internship", "Not Applicable", NA, "Mid-Senior level", "Mid-Senior level", NA, "Mid-Senior level", NA, "Associate", "Entry level", "Mid-Senior level", NA, "Associate", "Not Applicable", "Associate", "Execu...
$ required_education [3m[38;5;246m<chr>[39m[23m NA, NA, NA, "Bachelor's Degree", "Bachelor's Degree", NA, "Master's Degree", NA, NA, "High School or equivalent", "Bachelor's Degree", NA, "Bachelor's Degree", "Unspecified", "Bachelor's Degree", "Bachelor'...
$ industry [3m[38;5;246m<chr>[39m[23m NA, "Marketing and Advertising", NA, "Computer Software", "Hospital & Health Care", NA, "Online Media", NA, "Information Technology and Services", "Financial Services", "Information Technology and Services"...
$ `function` [3m[38;5;246m<chr>[39m[23m "Marketing", "Customer Service", NA, "Sales", "Health Care Provider", NA, "Management", NA, NA, "Customer Service", "Information Technology", NA, "Information Technology", "Other", "Sales", "Sales", "Engine...
$ fraudulent [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
# Per-column summary: min/quartiles/mean/max for numeric columns,
# length/class/mode for character columns.
summary(job)
job_id title location department salary_range company_profile description requirements benefits telecommuting has_company_logo has_questions employment_type
Min. : 1 Length:17880 Length:17880 Length:17880 Length:17880 Length:17880 Length:17880 Length:17880 Length:17880 Min. :0.0000 Min. :0.0000 Min. :0.0000 Length:17880
1st Qu.: 4471 Class :character Class :character Class :character Class :character Class :character Class :character Class :character Class :character 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 Class :character
Median : 8940 Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Median :0.0000 Median :1.0000 Median :0.0000 Mode :character
Mean : 8940 Mean :0.0429 Mean :0.7953 Mean :0.4917
3rd Qu.:13410 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :17880 Max. :1.0000 Max. :1.0000 Max. :1.0000
required_experience required_education industry function fraudulent
Length:17880 Length:17880 Length:17880 Length:17880 Min. :0.00000
Class :character Class :character Class :character Class :character 1st Qu.:0.00000
Mode :character Mode :character Mode :character Mode :character Median :0.00000
Mean :0.04843
3rd Qu.:0.00000
Max. :1.00000
There are 17,880 records, and 4.843% of the posted jobs are fraudulent, so we’re dealing with a class-imbalanced problem.
There is a column named ‘function’ in the dataset, which clashes with R’s reserved word `function` and therefore has to be wrapped in backticks every time it is used. This can be annoying later, so we’re going to rename it. Also, the job_id variable is just a running identifier and won’t do a lot of good either, so we’re dropping it now.
Once done, we’ll go ahead to look for missing values.
# Drop the `job_id` column and give the `function` column the name `func`,
# so it no longer needs backticks (`function` is a reserved word in R).
job <- job %>%
  select(-job_id) %>%
  rename(func = `function`)
# visdat supplies vis_miss(), used here to inspect missing values.
library(visdat)
# Plot the missing cells of `job`, column by column.
vis_miss(job)
It’s clear that salary_range is the variable with the most missing values. Let’s find out why this occurred.